Tian et al. 2011 Model 1¶

This file aims to reproduce the findings of Tian et al. 2011, "Genome-wide association study of leaf architecture in the

maize nested association mapping population".

In [ ]:
use_gpu_num = 1

import os
import pandas as pd
import numpy as np
import re

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
if use_gpu_num in [0, 1]: 
    torch.cuda.set_device(use_gpu_num)
print(f"Using {device} device")

import tqdm
from tqdm import tqdm

import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"


import dlgwas
from dlgwas.kegg import ensure_dir_path_exists
from dlgwas.kegg import get_cached_result
from dlgwas.kegg import put_cached_result

from dlgwas.dlfn import calc_cs
from dlgwas.dlfn import apply_cs
from dlgwas.dlfn import reverse_cs
/home/labmember/mambaforge/envs/pytorch_mamba/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Using cuda device
In [ ]:

In [ ]:
# set up directory for notebook artifacts
nb_name = '11_TianEtAl2011'

# import shutil
# # to rerun from scratch:
# shutil.rmtree('../models/'+nb_name, ignore_errors=True)
# shutil.rmtree('../reports/'+nb_name, ignore_errors=True)

ensure_dir_path_exists(dir_path = '../models/'+nb_name)
ensure_dir_path_exists(dir_path = '../reports/'+nb_name)

Load Cleaned Data¶

In [ ]:
# Read in cleaned data
taxa_groupings = pd.read_csv('../models/10_TianEtAl2011/taxa_groupings.csv')
data           = pd.read_csv('../models/10_TianEtAl2011/clean_data.csv')
In [ ]:
# Define holdout sets (Populations)
uniq_pop = list(set(taxa_groupings['Population']))
print(str(len(uniq_pop))+" Unique Holdout Groups.")
taxa_groupings['Holdout'] = None
for i in range(len(uniq_pop)):
    mask = (taxa_groupings['Population'] == uniq_pop[i])
    taxa_groupings.loc[mask, 'Holdout'] = i

taxa_groupings
25 Unique Holdout Groups.
Out[ ]:
Unnamed: 0 sample Population Holdout
0 0 Z001E0001 B73 x B97 18
1 1 Z001E0002 B73 x B97 18
2 2 Z001E0003 B73 x B97 18
3 3 Z001E0004 B73 x B97 18
4 4 Z001E0005 B73 x B97 18
... ... ... ... ...
4671 4671 Z026E0196 B73 x Tzi8 4
4672 4672 Z026E0197 B73 x Tzi8 4
4673 4673 Z026E0198 B73 x Tzi8 4
4674 4674 Z026E0199 B73 x Tzi8 4
4675 4675 Z026E0200 B73 x Tzi8 4

4676 rows × 4 columns

Setup Holdouts¶

In [ ]:
#randomly holdout a population if there is not a file with the population held out.
# Holdout_Int = 0
Holdout_Int_path = '../models/'+nb_name+'/holdout_pop_int.pkl'
if None != get_cached_result(Holdout_Int_path):
    Holdout_Int = get_cached_result(Holdout_Int_path)
else:
    Holdout_Int = int(np.random.choice([i for i in range(len(uniq_pop))], 1))
    put_cached_result(Holdout_Int_path, Holdout_Int)
In [ ]:
print("Holding out i="+str(Holdout_Int)+": "+uniq_pop[Holdout_Int])

mask = (taxa_groupings['Holdout'] == Holdout_Int)
train_idxs = list(taxa_groupings.loc[~mask, ].index)
test_idxs = list(taxa_groupings.loc[mask, ].index)
Holding out i=14: B73 x Il14H
In [ ]:
[len(e) for e in [test_idxs, train_idxs]]
Out[ ]:
[194, 4482]
In [ ]:
downsample_obs = True
train_n = 900
test_n = 100


if downsample_obs == True:
    train_idxs = np.random.choice(train_idxs, train_n)
    test_idxs = np.random.choice(test_idxs, test_n)
    print([len(e) for e in [test_idxs, train_idxs]])
[100, 900]
In [ ]:
 
In [ ]:
# used to go from index in tensor to index in data so that the right xs tensor can be loaded in
idx_original = np.array(data.index)

y1 = data['leaf_length']
y2 = data['leaf_width']
y3 = data['upper_leaf_angle']
y1 = np.array(y1)
y2 = np.array(y2)
y3 = np.array(y3)

Scale data¶

In [ ]:
scale_dict = {
    'y1':calc_cs(y1[train_idxs]),
    'y2':calc_cs(y2[train_idxs]),
    'y3':calc_cs(y3[train_idxs])
}
In [ ]:
y1 = apply_cs(y1, scale_dict['y1'])
y2 = apply_cs(y2, scale_dict['y2'])
y3 = apply_cs(y3, scale_dict['y3'])
In [ ]:
 

Allow for cycling data onto and off of GPU¶

In [ ]:
# loading this into memory causes the session to crash

y1_train = torch.from_numpy(y1[train_idxs])[:, None]
y2_train = torch.from_numpy(y2[train_idxs])[:, None]
y3_train = torch.from_numpy(y3[train_idxs])[:, None]

idx_original_train = torch.from_numpy(idx_original[train_idxs])
In [ ]:
 
In [ ]:
y1_test = torch.from_numpy(y1[test_idxs])[:, None]
y2_test = torch.from_numpy(y2[test_idxs])[:, None]
y3_test = torch.from_numpy(y3[test_idxs])[:, None]

idx_original_test = torch.from_numpy(idx_original[test_idxs])
In [ ]:
 
In [ ]:
class CustomDataset(Dataset):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if use_gpu_num in [0, 1]: 
        torch.cuda.set_device(use_gpu_num)
    print(f"Using {device} device")
    
    def __init__(self, y1, y2, y3, #xs, 
                 idx_original,
                 
                 transform = None, target_transform = None):
        self.y1 = y1
        self.y2 = y2
        self.y3 = y3
#         self.xs = xs
        self.idx_original = idx_original
        self.transform = transform
        self.target_transform = target_transform    
    
    def __len__(self):
        return len(self.y1)
    
    def __getitem__(self, idx):
        y1_idx = self.y1[idx].to(device).float()
        y2_idx = self.y2[idx].to(device).float()
        y3_idx = self.y3[idx].to(device).float()
        
        
        # Change type of xs loaded !! ----------------------------------------
        # load in xs as they are needed.
        # Non-Hilbert Version
        save_path = '../models/10_TianEtAl2011/markers/'
        # Hilbert version
        # save_path = '../models/'+nb_name+'/hilbert/'
        save_file_path = save_path+'m'+str(int(self.idx_original[idx]))+'.npz'
        xs_idx = np.load(save_file_path)['arr_0']
        xs_idx = torch.from_numpy(xs_idx).to(device).float()
        xs_idx = xs_idx.squeeze()
        
        if self.transform:
            xs_idx = self.transform(xs_idx)
            
        if self.target_transform:
            y1_idx = self.transform(y1_idx)
            y2_idx = self.transform(y2_idx)
            y3_idx = self.transform(y3_idx)
        return xs_idx, y1_idx, y2_idx, y3_idx
Using cuda device
In [ ]:
training_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_train,
        y2 = y2_train,
        y3 = y3_train,
        idx_original = idx_original_train
    ), 
    batch_size = 64, 
    shuffle = True)

testing_dataloader = DataLoader(
    CustomDataset(
        y1 = y1_test,
        y2 = y2_test,
        y3 = y3_test,
        idx_original = idx_original_train
    ), 
    batch_size = 64, 
    shuffle = True)
In [ ]:
xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
In [ ]:
# del training_dataloader
In [ ]:
# torch.cuda.empty_cache()
In [ ]:
xs_i.shape
Out[ ]:
torch.Size([64, 943455, 4])
In [ ]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()    
        self.x_network = nn.Sequential(
            nn.Flatten(),
            nn.Linear(3773820, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
        
    def forward(self, x):
        x_out = self.x_network(x)
        return x_out

model = NeuralNetwork().to(device)
# print(model)
In [ ]:
 
In [ ]:
model(xs_i)[0:3] # try prediction on one batch
Out[ ]:
tensor([[0.8475],
        [0.8549],
        [0.7640]], device='cuda:1', grad_fn=<SliceBackward0>)
In [ ]:
def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
    size = len(dataloader.dataset)
    for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(xs_i)
        loss = loss_fn(pred, y1_i) # <----------------------------------------

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(y1_i) # <----------------
            if not silent:
                print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
def train_error(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss = 0

    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            train_loss += loss_fn(pred, y1_i).item() # <----------------------
            
    train_loss /= num_batches
    return(train_loss) 

            
def test_loop(dataloader, model, loss_fn, silent = False):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss = 0

    with torch.no_grad():
        for xs_i, y1_i, y2_i, y3_i in dataloader:
            pred = model(xs_i)
            test_loss += loss_fn(pred, y1_i).item() # <-----------------------

    test_loss /= num_batches
    if not silent:
        print(f"Test Error: Avg loss: {test_loss:>8f}")
    return(test_loss) 
In [ ]:
def train_nn(
    training_dataloader,
    testing_dataloader,
    model,
    learning_rate = 1e-3,
    batch_size = 64,
    epochs = 500
):
    # Initialize the loss function
    loss_fn = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

    loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
    loss_df['TrainMSE'] = np.nan
    loss_df['TestMSE']  = np.nan

    for t in tqdm(range(epochs)):        
#         print(f"Epoch {t+1}\n-------------------------------")
        train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

        loss_df.loc[loss_df.index == t, 'TrainMSE'
                   ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
        loss_df.loc[loss_df.index == t, 'TestMSE'
                   ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
        if (t+1)%10: # Cache in case training is interupted
#             print(loss_df.loc[loss_df.index == t, ['TrainMSE', 'TestMSE']])
            torch.save(model.state_dict(), 
                       '../models/'+nb_name+'/model_'+str(t)+'_'+str(epochs)+'.pt') # convention is to use .pt or .pth
        
    return([model, loss_df])
In [ ]:
# don't run if either of these exist because there may be cases where we want the results but not the model

if not os.path.exists('../models/'+nb_name+'/model.pt'): 

    model, loss_df = train_nn(
        training_dataloader,
        testing_dataloader,
        model,
        learning_rate = 1e-3,
        batch_size = 64,
        epochs = 20
    )
    
    # experimental outputs:
    # 1. Model
    torch.save(model.state_dict(), '../models/'+nb_name+'/model.pt') # convention is to use .pt or .pth

    # 2. loss_df
    loss_df.to_csv('../reports/'+nb_name+'/loss_df.csv', index=False)  
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 20/20 [19:04<00:00, 57.23s/it]
In [ ]:
# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 1
# )
In [ ]:
# # don't run if either of these exist because there may be cases where we want the results but not the model
# #| os.path.exists('../reports/'+nb_name+'/loss_df.csv')

# if not os.path.exists('../models/'+nb_name+'/model.pt'): 

#     model, loss_df = train_nn(
#         training_dataloader,
#         testing_dataloader,
#         model,
#         learning_rate = 1e-3,
#         batch_size = 64,
#         epochs = 250
#     )
    
#     # experimental outputs:
#     # 1. Model
#     torch.save(model.state_dict(), '../models/'+nb_name+'/model.pt') # convention is to use .pt or .pth

#     # 2. loss_df
#     loss_df.to_csv('../reports/'+nb_name+'/loss_df.csv', index=False)
    
In [ ]:
# This uses 6650/8192 MiB Available
# Perhaps by using a larger batch size we can increase the used amount.
# Increasing the size of the model will require more memory so this should be monitored.

# for one iteration it takes 1/1 [04:28<00:00, 268.24s/it]
# NeuralNetwork(
#   (x_network): Sequential(
#     (0): Flatten(start_dim=1, end_dim=-1)
#     (1): Linear(in_features=3773820, out_features=64, bias=True)
#     (2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
#     (3): ReLU()
#     (4): Linear(in_features=64, out_features=1, bias=True)
#   )
# )

#     45/500 [3:22:20<34:19:22, 271.57s/it]
In [ ]:
loss_df = pd.read_csv('../reports/'+nb_name+'/loss_df.csv')
In [ ]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
                    mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
                    mode='lines', name='Test'))
fig.show()
In [ ]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=loss_df.Epoch, 
                         y= reverse_cs(loss_df.TrainMSE, 
                                       scale_dict['y1']),
                         mode='lines', name='Train'))
fig.add_trace(go.Scatter(x=loss_df.Epoch, 
                         y= reverse_cs(loss_df.TestMSE, 
                                       scale_dict['y1']),
                         mode='lines', name='Test'))
fig.show()
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
# # ! conda install captum -c pytorch -y


# # imports from captum library
# from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
# from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation



# ig = IntegratedGradients(model)
# ig_nt = NoiseTunnel(ig)
# dl = DeepLift(model)
# gs = GradientShap(model)
# fa = FeatureAblation(model)

# ig_attr_test = ig.attribute(xs_test, n_steps=50)
# ig_nt_attr_test = ig_nt.attribute(xs_test)
# dl_attr_test = dl.attribute(xs_test)
# gs_attr_test = gs.attribute(xs_test, xs_train)
# fa_attr_test = fa.attribute(xs_test)

# [e.shape for e in [ig_attr_test,
# ig_nt_attr_test,
# dl_attr_test,
# gs_attr_test,
# fa_attr_test]]

# fig = go.Figure()
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = ig_nt_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = dl_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = gs_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = fa_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.show()

# len(dl_attr_test.cpu().detach().numpy().mean(axis = 0))















# ## Version 2, Predict `y1` (Anthesis), `y2` (Silking), and `y3` (ASI)

# Here each model will predict 3 values. The loss function is still mse, but the y tensors are concatenated

# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()    
#         self.x_network = nn.Sequential(
#             nn.Linear(1106, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
#             nn.Linear(64, 3))
        
#     def forward(self, x):
#         x_out = self.x_network(x)
#         return x_out

# model = NeuralNetwork().to(device)
# # print(model)

# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch





# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
#     size = len(dataloader.dataset)
#     for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
#         # Compute prediction and loss
#         pred = model(xs_i)
#         loss = loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)) # <----------------------------------------

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if batch % 100 == 0:
#             loss, current = loss.item(), batch * len(y1_i) # <----------------
#             if not silent:
#                 print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
# def train_error(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     train_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             train_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <----------------------
            
#     train_loss /= num_batches
#     return(train_loss) 

            
# def test_loop(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     test_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             test_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <-----------------------

#     test_loss /= num_batches
#     if not silent:
#         print(f"Test Error: Avg loss: {test_loss:>8f}")
#     return(test_loss) 


# def train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# ):
#     # Initialize the loss function
#     loss_fn = nn.MSELoss()
#     optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#     loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
#     loss_df['TrainMSE'] = np.nan
#     loss_df['TestMSE']  = np.nan

#     for t in tqdm.tqdm(range(epochs)):
#         # print(f"Epoch {t+1}\n-------------------------------")
#         train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

#         loss_df.loc[loss_df.index == t, 'TrainMSE'
#                    ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
#         loss_df.loc[loss_df.index == t, 'TestMSE'
#                    ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
#     return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 5000
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()





# '../ext_data/zma/panzea/phenotypes/'

# # pd.read_table('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212.txt', low_memory = False)

# # pd.read_excel('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212_TraitDescritptions.xlsx')
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
 
In [ ]:
# data = pd.read_table('../ext_data/zma/panzea/phenotypes/Buckler_etal_2009_Science_flowering_time_data-090807/markergenotypes062508.txt', skiprows=1
#                     ).reset_index().rename(columns = {'index': 'Geno_Code'})
# data
In [ ]:
# px.scatter_matrix(data.loc[:, ['days2anthesis', 'days2silk', 'asi']])
In [ ]:
# d2a = np.array(data['days2anthesis'])
# d2s = np.array(data['days2silk'])
# asi = np.array(data['asi'])
In [ ]:
# xs = np.array(data.drop(columns = ['days2anthesis', 'days2silk', 'asi', 'pop', 'Geno_Code']))

# n_obs = xs.shape[0]

# np_seed = 9070707
# rng = np.random.default_rng(np_seed)  # can be called without a seed

# test_pr = 0.2

# test_n = round(n_obs*test_pr)
# idxs = np.linspace(0, n_obs-1, num = n_obs).astype(int)
# rng.shuffle(idxs)

# test_idxs = idxs[0:test_n]
# train_idxs = idxs[test_n:-1]

Make tensors¶

In [ ]:
# y1_train = torch.from_numpy(y1[train_idxs]).to(device).float()[:, None]
# y2_train = torch.from_numpy(y2[train_idxs]).to(device).float()[:, None]
# y3_train = torch.from_numpy(y3[train_idxs]).to(device).float()[:, None]
# xs_train = torch.from_numpy(xs[train_idxs]).to(device).float()

# y1_test = torch.from_numpy(y1[test_idxs]).to(device).float()[:, None]
# y2_test = torch.from_numpy(y2[test_idxs]).to(device).float()[:, None]
# y3_test = torch.from_numpy(y3[test_idxs]).to(device).float()[:, None]
# xs_test = torch.from_numpy(xs[test_idxs]).to(device).float()
In [ ]:
# class CustomDataset(Dataset):
#     def __init__(self, y1, y2, y3, xs, transform = None, target_transform = None):
#         self.y1 = y1
#         self.y2 = y2
#         self.y3 = y3
#         self.xs = xs
#         self.transform = transform
#         self.target_transform = target_transform    
    
#     def __len__(self):
#         return len(self.y1)
    
#     def __getitem__(self, idx):
#         y1_idx = self.y1[idx]
#         y2_idx = self.y2[idx]
#         y3_idx = self.y3[idx]
#         xs_idx = self.xs[idx]
        
#         if self.transform:
#             xs_idx = self.transform(xs_idx)
            
#         if self.target_transform:
#             y1_idx = self.transform(y1_idx)
#             y2_idx = self.transform(y2_idx)
#             y3_idx = self.transform(y3_idx)
#         return xs_idx, y1_idx, y2_idx, y3_idx
In [ ]:
# training_dataloader = DataLoader(
#     CustomDataset(
#         y1 = y1_train,
#         y2 = y2_train,
#         y3 = y3_train,
#         xs = xs_train
#     ), 
#     batch_size = 64, 
#     shuffle = True)

# testing_dataloader = DataLoader(
#     CustomDataset(
#         y1 = y1_test,
#         y2 = y2_test,
#         y3 = y3_test,
#         xs = xs_test
#     ), 
#     batch_size = 64, 
#     shuffle = True)

# xs.shape

Version 1, Predict y1 (Anthesis)¶

In [ ]:
# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()    
#         self.x_network = nn.Sequential(
#             nn.Linear(1106, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
#             nn.Linear(64, 1))
        
#     def forward(self, x):
#         x_out = self.x_network(x)
#         return x_out

# model = NeuralNetwork().to(device)
# # print(model)

# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch

# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
#     size = len(dataloader.dataset)
#     for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
#         # Compute prediction and loss
#         pred = model(xs_i)
#         loss = loss_fn(pred, y1_i) # <----------------------------------------

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if batch % 100 == 0:
#             loss, current = loss.item(), batch * len(y1_i) # <----------------
#             if not silent:
#                 print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
# def train_error(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     train_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             train_loss += loss_fn(pred, y1_i).item() # <----------------------
            
#     train_loss /= num_batches
#     return(train_loss) 

            
# def test_loop(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     test_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             test_loss += loss_fn(pred, y1_i).item() # <-----------------------

#     test_loss /= num_batches
#     if not silent:
#         print(f"Test Error: Avg loss: {test_loss:>8f}")
#     return(test_loss) 


# def train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# ):
#     # Initialize the loss function
#     loss_fn = nn.MSELoss()
#     optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#     loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
#     loss_df['TrainMSE'] = np.nan
#     loss_df['TestMSE']  = np.nan

#     for t in tqdm.tqdm(range(epochs)):
#         # print(f"Epoch {t+1}\n-------------------------------")
#         train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

#         loss_df.loc[loss_df.index == t, 'TrainMSE'
#                    ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
#         loss_df.loc[loss_df.index == t, 'TestMSE'
#                    ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
#     return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# # ! conda install captum -c pytorch -y


# # imports from captum library
# from captum.attr import LayerConductance, LayerActivation, LayerIntegratedGradients
# from captum.attr import IntegratedGradients, DeepLift, GradientShap, NoiseTunnel, FeatureAblation



# ig = IntegratedGradients(model)
# ig_nt = NoiseTunnel(ig)
# dl = DeepLift(model)
# gs = GradientShap(model)
# fa = FeatureAblation(model)

# ig_attr_test = ig.attribute(xs_test, n_steps=50)
# ig_nt_attr_test = ig_nt.attribute(xs_test)
# dl_attr_test = dl.attribute(xs_test)
# gs_attr_test = gs.attribute(xs_test, xs_train)
# fa_attr_test = fa.attribute(xs_test)

# [e.shape for e in [ig_attr_test,
# ig_nt_attr_test,
# dl_attr_test,
# gs_attr_test,
# fa_attr_test]]

# fig = go.Figure()
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = ig_nt_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = dl_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = gs_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.add_trace(go.Scatter(x = np.linspace(0, 1106-1, 1106),
#                          y = fa_attr_test.cpu().detach().numpy().mean(axis=0),
#                          mode='lines', name='Test'))
# fig.show()

# len(dl_attr_test.cpu().detach().numpy().mean(axis = 0))















# ## Version 2, Predict `y1` (Anthesis), `y2` (Silking), and `y3` (ASI)

# Here each model will predict 3 values. The loss function is still mse, but the y tensors are concatenated

# class NeuralNetwork(nn.Module):
#     def __init__(self):
#         super(NeuralNetwork, self).__init__()    
#         self.x_network = nn.Sequential(
#             nn.Linear(1106, 64),
#             nn.BatchNorm1d(64),
#             nn.ReLU(),
#             nn.Linear(64, 3))
        
#     def forward(self, x):
#         x_out = self.x_network(x)
#         return x_out

# model = NeuralNetwork().to(device)
# # print(model)

# xs_i, y1_i, y2_i, y3_i = next(iter(training_dataloader))
# model(xs_i).shape # try prediction on one batch





# def train_loop(dataloader, model, loss_fn, optimizer, silent = False):
#     size = len(dataloader.dataset)
#     for batch, (xs_i, y1_i, y2_i, y3_i) in enumerate(dataloader):
#         # Compute prediction and loss
#         pred = model(xs_i)
#         loss = loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)) # <----------------------------------------

#         # Backpropagation
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()

#         if batch % 100 == 0:
#             loss, current = loss.item(), batch * len(y1_i) # <----------------
#             if not silent:
#                 print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

                
# def train_error(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     train_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             train_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <----------------------
            
#     train_loss /= num_batches
#     return(train_loss) 

            
# def test_loop(dataloader, model, loss_fn, silent = False):
#     size = len(dataloader.dataset)
#     num_batches = len(dataloader)
#     test_loss = 0

#     with torch.no_grad():
#         for xs_i, y1_i, y2_i, y3_i in dataloader:
#             pred = model(xs_i)
#             test_loss += loss_fn(pred, torch.concat([y1_i, y2_i, y3_i], axis = 1)).item() # <-----------------------

#     test_loss /= num_batches
#     if not silent:
#         print(f"Test Error: Avg loss: {test_loss:>8f}")
#     return(test_loss) 


# def train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# ):
#     # Initialize the loss function
#     loss_fn = nn.MSELoss()
#     optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#     loss_df = pd.DataFrame([i for i in range(epochs)], columns = ['Epoch'])
#     loss_df['TrainMSE'] = np.nan
#     loss_df['TestMSE']  = np.nan

#     for t in tqdm.tqdm(range(epochs)):
#         # print(f"Epoch {t+1}\n-------------------------------")
#         train_loop(training_dataloader, model, loss_fn, optimizer, silent = True)

#         loss_df.loc[loss_df.index == t, 'TrainMSE'
#                    ] = train_error(training_dataloader, model, loss_fn, silent = True)
        
#         loss_df.loc[loss_df.index == t, 'TestMSE'
#                    ] = test_loop(testing_dataloader, model, loss_fn, silent = True)
        
#     return([model, loss_df])

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 500
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()

# model, loss_df = train_nn(
#     training_dataloader,
#     testing_dataloader,
#     model,
#     learning_rate = 1e-3,
#     batch_size = 64,
#     epochs = 5000
# )

# fig = go.Figure()
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TrainMSE,
#                     mode='lines', name='Train'))
# fig.add_trace(go.Scatter(x=loss_df.Epoch, y=loss_df.TestMSE,
#                     mode='lines', name='Test'))
# fig.show()





# '../ext_data/zma/panzea/phenotypes/'

# # pd.read_table('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212.txt', low_memory = False)

# # pd.read_excel('../ext_data/zma/panzea/phenotypes/traitMatrix_maize282NAM_v15-130212_TraitDescritptions.xlsx')
In [ ]: